import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import plotly.express as px
import missingno as msno
#################################################################
##### Pandas display options for readable notebook output
#################################################################
# Option name -> value. None removes the row/column truncation limit.
_PANDAS_DISPLAY_OPTIONS = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 1000,
    'display.colheader_justify': 'center',
    'display.precision': 3,
    'display.max_colwidth': 50,
}
for _option_name, _option_value in _PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)
# COVID data URL picked from https://github.com/owid/covid-19-data/tree/master/public/data
COVID_DATA_URL = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
COVID_DATA_FILE_NAME = "owid-covid-data.csv"
COVID_DATA_DICTIONARY_URL = "https://covid.ourworldindata.org/data/owid-covid-codebook.csv"
COVID_DATA_DICTIONARY_FILE_NAME = "owid-covid-codebook.csv"

# Local layout: <RESULTS_PATH>/<DATA_FOLDER>/<file name>
RESULTS_PATH = "EDS_Covid19_Exp_results"
DATA_FOLDER = "data"
covid_data_full_file_path = os.path.join(RESULTS_PATH,
                                         DATA_FOLDER,
                                         COVID_DATA_FILE_NAME)
covid_data_dictionary_full_file_path = os.path.join(RESULTS_PATH,
                                                    DATA_FOLDER,
                                                    COVID_DATA_DICTIONARY_FILE_NAME)

# Creating experiment folders as required.
# makedirs also creates RESULTS_PATH as an intermediate directory, and
# exist_ok=True makes the call a no-op when the folders are already there,
# so no separate (race-prone) isdir checks are needed.
os.makedirs(os.path.join(RESULTS_PATH, DATA_FOLDER), exist_ok=True)
#################################################################
##### Covid data and covid data dictionary
#################################################################
def _load_or_download_csv(local_path, url, label, leading_newline=False):
    """Return the CSV at *local_path*, downloading it from *url* if absent.

    When the local file does not exist, the CSV is read from *url* and a
    copy is written to *local_path* so later runs load it from disk.

    *label* names the dataset in the progress messages ("data",
    "dictionary"); *leading_newline* starts the first message with a blank
    line to separate it from earlier output.
    """
    prefix = "\n" if leading_newline else ""
    if os.path.isfile(local_path):
        print(f"{prefix}Local {label} file available:", local_path)
        frame = pd.read_csv(local_path, sep=",", header=0)
        print(f"{label.capitalize()} load complete.")
        return frame

    print(f"{prefix}No local {label} file found. Downloading from URL:", url)
    frame = pd.read_csv(url, sep=",", header=0)
    frame.to_csv(local_path, sep=",", header=True, index=False)
    print(f"{label.capitalize()} downloaded and saved to:", local_path)
    return frame


df_full_covid_data = _load_or_download_csv(covid_data_full_file_path,
                                           COVID_DATA_URL,
                                           "data")
df_covid_data_dict = _load_or_download_csv(covid_data_dictionary_full_file_path,
                                           COVID_DATA_DICTIONARY_URL,
                                           "dictionary",
                                           leading_newline=True)
Local data file available: EDS_Covid19_Exp_results\data\owid-covid-data.csv Data load complete. Local dictionary file available: EDS_Covid19_Exp_results\data\owid-covid-codebook.csv Dictionary load complete.
#################################################################
##### codebook - contains details of every column specified by data provider
#################################################################
# Uncomment to list each column name next to its description from the codebook:
# df_covid_data_dict[['column', 'description']]
# Notebook-style bare expression: displays (number of rows, number of columns)
# of the full dataset.
df_full_covid_data.shape
(197003, 67)
# Columns needed for Delivery 1 and 2. The order matters: the three
# vaccination counters must stay at the end of the list.
required_fields = [
    'location',
    'date',
    'total_cases',
    'population',
    'people_vaccinated',
    'people_fully_vaccinated',
    'total_boosters',
]
# Rows without a continent value are reported as non-country locations.
print('Non-country locations:\n',
      df_full_covid_data.loc[df_full_covid_data['continent'].isna(), 'location'].unique())
Non-country locations: ['Africa' 'Asia' 'Europe' 'European Union' 'High income' 'International' 'Low income' 'Lower middle income' 'North America' 'Oceania' 'South America' 'Upper middle income' 'World']
# Discard every row that has no continent value and renumber the index from 0.
df_full_covid_data = df_full_covid_data.dropna(subset=['continent']).reset_index(drop=True)
# Per-column completeness overview of the remaining rows.
msno.bar(df_full_covid_data)
<AxesSubplot:>
# Same completeness bar chart, restricted to the columns in required_fields.
msno.bar(df_full_covid_data[required_fields])
<AxesSubplot:>
# Nullity matrix over all columns: shows where in the rows values are present or missing.
msno.matrix(df_full_covid_data)
<AxesSubplot:>
# Same nullity matrix, restricted to the columns in required_fields.
msno.matrix(df_full_covid_data[required_fields])
<AxesSubplot:>
# Dendrogram grouping columns by how similar their missing-value patterns are.
msno.dendrogram(df_full_covid_data)
<AxesSubplot:>
# Same dendrogram, restricted to the columns in required_fields.
msno.dendrogram(df_full_covid_data[required_fields])
<AxesSubplot:>
# Share of missing vaccination values per location.
# The three vaccination counters are the last three entries of required_fields.
vaccination_fields = required_fields[-3:]
missing_by_location = df_full_covid_data[vaccination_fields].isna().groupby(df_full_covid_data['location'])

# Rows per location, indexed by location. Everything below aligns on that
# label instead of relying on two independently sorted tables lining up by
# position (and on the column names value_counts().reset_index() produces,
# which changed in pandas 2.0).
location_records = missing_by_location.size()

# Missing count / row count = fraction of rows with a missing value.
df_missing_vaccination_by_country = missing_by_location.sum().div(location_records, axis=0)
df_missing_vaccination_by_country['location_records'] = location_records
df_missing_vaccination_by_country['mean_total_missing'] = (
    df_missing_vaccination_by_country[vaccination_fields].sum(axis=1) / len(vaccination_fields)
)
df_missing_vaccination_by_country = df_missing_vaccination_by_country.reset_index()

fig = px.bar(df_missing_vaccination_by_country, x='location', y='mean_total_missing')
fig.show()
# Ten locations with the smallest mean share of missing vaccination values.
df_missing_vaccination_by_country.sort_values(by='mean_total_missing', ascending=True).head(10)
| | location | people_vaccinated | people_fully_vaccinated | total_boosters | location_records | mean_total_missing |
|---|---|---|---|---|---|---|
| 111 | Latvia | 0.349 | 0.365 | 0.372 | 852 | 0.362 |
| 198 | Switzerland | 0.366 | 0.366 | 0.377 | 853 | 0.370 |
| 218 | United States | 0.373 | 0.373 | 0.400 | 887 | 0.382 |
| 63 | Estonia | 0.382 | 0.382 | 0.392 | 875 | 0.385 |
| 117 | Lithuania | 0.364 | 0.385 | 0.426 | 849 | 0.392 |
| 153 | Norway | 0.342 | 0.374 | 0.471 | 853 | 0.396 |
| 70 | France | 0.385 | 0.386 | 0.419 | 885 | 0.397 |
| 54 | Denmark | 0.365 | 0.381 | 0.445 | 876 | 0.397 |
| 7 | Argentina | 0.400 | 0.400 | 0.400 | 908 | 0.400 |
| 35 | Canada | 0.418 | 0.388 | 0.438 | 886 | 0.415 |
# Pick the three locations with the smallest mean share of missing
# vaccination values.
ranked_by_missing = df_missing_vaccination_by_country.sort_values(by='mean_total_missing',
                                                                  ascending=True)
country_list = ranked_by_missing['location'].head(3).tolist()

# Keep only those locations and only the required columns.
is_selected_country = df_full_covid_data['location'].isin(country_list)
df_3country_covid_data = df_full_covid_data.loc[is_selected_country, required_fields].reset_index(drop=True)

# Numeric columns whose gaps are filled in the imputation step.
impute_column_list = [
    'total_cases',
    'population',
    'people_vaccinated',
    'people_fully_vaccinated',
    'total_boosters',
]
# Nullity matrix of the selection before imputation.
msno.matrix(df_3country_covid_data)
<AxesSubplot:>
# Fill the gaps separately for every country so values are never
# interpolated across two different countries.
imputed_frames = []
for current_country in df_3country_covid_data['location'].unique():
    country_rows = df_3country_covid_data[df_3country_covid_data['location'] == current_country]
    # Interpolation follows row order, so put the rows in date order first.
    country_rows = country_rows.sort_values(by='date', ascending=True)
    # limit_direction="both" also fills gaps at the start and end of the
    # series. assign() returns a new frame, so nothing is written back
    # into a slice of df_3country_covid_data.
    country_rows = country_rows.assign(**{
        col: country_rows[col].interpolate(limit_direction="both")
        for col in impute_column_list
    })
    imputed_frames.append(country_rows)

# Concatenate once at the end instead of re-copying the growing result on
# every iteration; the index is renumbered from 0.
df_3country_covid_data_imputed = pd.concat(imputed_frames, ignore_index=True)
# Nullity matrix after imputation.
msno.matrix(df_3country_covid_data_imputed)
<AxesSubplot:>
# Total cases relative to population. The column stores a fraction
# (total_cases / population), not a value already multiplied by 100.
df_3country_covid_data_imputed['infector_percentage'] = (
    df_3country_covid_data_imputed['total_cases'] / df_3country_covid_data_imputed['population']
)

fig = px.line(
    df_3country_covid_data_imputed,
    x='date',
    y='infector_percentage',
    color='location',
)
fig.update_layout(
    yaxis_title="Total cases as percentage of population",
    xaxis_title="Timeline",
    # Render the stored fractions as percentages so the axis matches its title.
    yaxis_tickformat='.1%',
    title={
        'text': 'The total affected rate (percentage of the population) over time',
        'x': 0.45,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    legend={
        'title': {'text': 'Location'},
    },
)
fig.show(renderer='notebook')
# Vaccination counters relative to population, stored as fractions.
# Dose 1 = people_vaccinated, dose 2 = people_fully_vaccinated,
# dose 3 = total_boosters.
df_3country_covid_data_imputed['vaccination_dose1_percentage'] = (
    df_3country_covid_data_imputed['people_vaccinated'] / df_3country_covid_data_imputed['population']
)
df_3country_covid_data_imputed['vaccination_dose2_percentage'] = (
    df_3country_covid_data_imputed['people_fully_vaccinated'] / df_3country_covid_data_imputed['population']
)
df_3country_covid_data_imputed['vaccination_dose3_percentage'] = (
    df_3country_covid_data_imputed['total_boosters'] / df_3country_covid_data_imputed['population']
)

# Reshape to long form: one row per (location, dose, date), so each
# location/dose pair becomes its own line in the plot.
dose_frames = []
for dose in range(1, 4):
    dose_column = f'vaccination_dose{dose}_percentage'
    # Build a fresh frame rather than assigning into a slice of the source.
    dose_frames.append(pd.DataFrame({
        'location, dose': df_3country_covid_data_imputed['location'] + ', ' + str(dose),
        'date': df_3country_covid_data_imputed['date'],
        'vaccination_percentage': df_3country_covid_data_imputed[dose_column],
    }))
df_3country_covid_data_imputed_Transformed = pd.concat(dose_frames, ignore_index=True)

fig = px.line(
    df_3country_covid_data_imputed_Transformed,
    x='date',
    y='vaccination_percentage',
    color='location, dose',
)
fig.update_layout(
    yaxis_title="Percentage of population vaccinated",
    xaxis_title="Timeline",
    # Render the stored fractions as percentages so the axis matches its title.
    yaxis_tickformat='.1%',
    title={
        'text': 'The vaccination rate (percentage of the population) over time',
        'x': 0.45,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    legend={
        'title': {'text': 'Location, Dosage'},
    },
)
fig.show(renderer='notebook')